This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.
Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Cmd+Shift+Enter.
Import Libraries
library(aplore3)
library(caret)
library(dplyr)
library(tidyverse)
library(car)
require(ggthemes)
library(glmnet)
library(cowplot)
library(ROCR)
library(GGally)
library(ResourceSelection)
#Import Data
data = glow_bonemed
data = data.frame(data)
dim(data)
[1] 500 18
#Split train and test set
set.seed(66)
trainIndex <- createDataPartition(data$fracture, p = .8,
list = FALSE,
times = 1)
train <- data[trainIndex,]
test <- data[-trainIndex,]
dim(train)
[1] 400 18
dim(test)
[1] 100 18
All of the EDA will be done in the train data
View head of dataframe
head(train)
Looking at Fracture balance
# look for class imbalance
# The dataset is hevaily imbalance with more No's than Yes
data_classes = data %>% ggplot(aes(x=fracture)) + geom_bar() + theme_fivethirtyeight()
train_classes = train %>% ggplot(aes(x=fracture)) + geom_bar() + theme_fivethirtyeight()
test_classes = test %>% ggplot(aes(x=fracture)) + geom_bar() + theme_fivethirtyeight()
plot_grid(data_classes, train_classes, test_classes, labels = c("Overall Data", "Train Data", "Test Data"))
Let’s look at pair plots from all the numeric variables
train_numeric = train %>% select_if(is.numeric)
pairs(train[,2:8],col=as.factor(train$fracture))
Looking at a different view of pair plots for numerical variables. Excluding id’s
ggpairs(train,columns=4:8,aes(colour=fracture))
Looking at box plot for different numerical variables per fracture or not
boxplot_age = train %>% ggplot(aes(y=age, x=fracture)) + geom_boxplot() + ggtitle("age vs fracture") + theme_fivethirtyeight()
boxplot_weight = train %>% ggplot(aes(y=weight, x=fracture)) + geom_boxplot() + ggtitle("weight vs fracture") + theme_fivethirtyeight()
boxplot_height = train %>% ggplot(aes(y=height, x=fracture)) + geom_boxplot() + ggtitle("height vs fracture") + theme_fivethirtyeight()
boxplot_bmi= train %>% ggplot(aes(y=bmi, x=fracture)) + geom_boxplot() + ggtitle("bmi vs fracture") + theme_fivethirtyeight()
boxplot_fracscore= train %>% ggplot(aes(y=fracscore, x=fracture)) + geom_boxplot() + ggtitle("bmi vs fracture") + theme_fivethirtyeight()
plot_grid(boxplot_age, boxplot_weight, boxplot_height, boxplot_bmi, boxplot_fracscore, nrow=2, ncol=2)
Lets look at bmi vs age per different categorical variables
# relation of bmi and age
age_bim_fracture = train %>% ggplot(aes(x=age, y=bmi, col=fracture)) + geom_point() + geom_smooth(method = 'loess' , formula = 'y ~ x'
) + ggtitle("bmi vs age") + xlab("age") + ylab("bmi") + theme_minimal()
age_bim_premeno = train %>% ggplot(aes(x=age, y=bmi, col=premeno)) + geom_point() + geom_smooth(method = 'loess' , formula = 'y ~ x'
) + ggtitle("bmi vs age") + xlab("age") + ylab("bmi") + theme_minimal()
age_bim_smoke = train %>% ggplot(aes(x=age, y=bmi, col=raterisk)) + geom_point() + geom_smooth(method = 'loess' , formula = 'y ~ x'
) + ggtitle("bmi vs age") + xlab("age") + ylab("bmi") + theme_minimal()
age_bim_raterisk = train %>% ggplot(aes(x=age, y=bmi, col=smoke)) + geom_point() + geom_smooth(method = 'loess' , formula = 'y ~ x'
) + ggtitle("bmi vs age") + xlab("age") + ylab("bmi") + theme_minimal()
plot_grid(age_bim_fracture, age_bim_premeno,age_bim_smoke, age_bim_raterisk, nrow=4, ncol=1)
Lets look at different numerica variables vs categorical variables per site id The point os to investigate if site id had any impact
bmi_frac_type = train %>% ggplot(aes(x=fracture, y=bmi, col=as.factor(site_id))) + geom_boxplot() + ggtitle("BMI for fracture type per site id")
age_frac_type = train %>% ggplot(aes(x=fracture, y=age, col=as.factor(site_id))) + geom_boxplot() + ggtitle("Age for fracture type per site id")
weight_frac_type = train %>% ggplot(aes(x=fracture, y=weight, col=as.factor(site_id))) + geom_boxplot() + ggtitle("Weight for fracture type per site id")
height_frac_type = train %>% ggplot(aes(x=fracture, y=height, col=as.factor(site_id))) + geom_boxplot() + ggtitle("Height for fracture type per site id")
plot_grid(bmi_frac_type, age_frac_type, weight_frac_type,height_frac_type, nrow=2, ncol=2)
Lets train an interpretable logistic regression using the lasso technique The point of this model is to be interpretable, meaning no exotic variables such as iteraction terms
## removed sub_id site_id phy_id
train.x <- model.matrix(fracture~ site_id + phy_id + priorfrac + age + weight + height + bmi + premeno + momfrac + armassist + smoke+ raterisk + fracscore + bonemed + bonemed_fu + bonetreat, train)
train.y<-train[,15]
nFolds = 10
set.seed(3)
foldid = sample(rep(seq(nFolds), length.out = nrow(train.x)))
lambdas_to_try <- 10^seq(-3, 5, length.out = 2000)
set.seed(3)
cvfit = cv.glmnet(train.x, train.y,
family = "binomial",
type.measure = "class",
lambda = lambdas_to_try,
nfolds = nFolds,
foldid = foldid)
plot(cvfit)
coef(cvfit, s = "lambda.min")
19 x 1 sparse Matrix of class "dgCMatrix"
1
(Intercept) 3.14581196
(Intercept) .
site_id 0.04744940
phy_id .
priorfracYes 0.27244168
age .
weight .
height -0.04162330
bmi 0.03343667
premenoYes 0.43710027
momfracYes 0.63341776
armassistYes .
smokeYes -0.62861341
rateriskSame 0.19988267
rateriskGreater 0.39124916
fracscore 0.17335813
bonemedYes 0.93958310
bonemed_fuYes 1.23851632
bonetreatYes -1.63890916
print("CV Error Rate:")
[1] "CV Error Rate:"
cvfit$cvm[which(cvfit$lambda==cvfit$lambda.min)]
[1] 0.24
#Optimal penalty
print("Penalty Value:")
[1] "Penalty Value:"
cvfit$lambda.min
[1] 0.003469526
build a final interpretable model based on feature selection and lambda value selected above
#For final model predictions go ahead and refit lasso using entire
#data set
#finalmodel = glmnet(train.x, train.y, family = "binomial",lambda=cvfit$lambda.min)
finalmodel<-glm(fracture ~ site_id + priorfrac + height + bmi + premeno + momfrac +
smoke + raterisk + raterisk + fracscore + bonemed +
bonemed_fu + bonetreat
, data=train,family = binomial(link="logit"))
coef(finalmodel)
(Intercept) site_id priorfracYes height bmi premenoYes momfracYes
3.74939105 0.05389650 0.30869979 -0.04715192 0.03726393 0.51497418 0.72431857
smokeYes rateriskSame rateriskGreater fracscore bonemedYes bonemed_fuYes bonetreatYes
-0.72391421 0.28238740 0.47088398 0.17672553 1.77392450 1.57985379 -2.81457532
confint(finalmodel)
Waiting for profiling to be done...
2.5 % 97.5 %
(Intercept) -3.171720694 10.846686131
site_id -0.083143760 0.191472608
priorfracYes -0.308857865 0.919463140
height -0.089846240 -0.006160707
bmi -0.006023961 0.080555865
premenoYes -0.116772035 1.131952986
momfracYes 0.010995312 1.422843336
smokeYes -2.035932212 0.349436980
rateriskSame -0.356038631 0.933732303
rateriskGreater -0.208169120 1.159624762
fracscore 0.056654052 0.299272430
bonemedYes 0.135972387 3.503486123
bonemed_fuYes 0.515414220 2.699537279
bonetreatYes -4.859074904 -0.855499389
summary(finalmodel)
Call:
glm(formula = fracture ~ site_id + priorfrac + height + bmi +
premeno + momfrac + smoke + raterisk + raterisk + fracscore +
bonemed + bonemed_fu + bonetreat, family = binomial(link = "logit"),
data = train)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.77345 -0.72697 -0.50850 -0.00233 2.31716
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 3.74939 3.56438 1.052 0.29284
site_id 0.05390 0.06988 0.771 0.44056
priorfracYes 0.30870 0.31252 0.988 0.32326
height -0.04715 0.02128 -2.216 0.02669 *
bmi 0.03726 0.02200 1.694 0.09032 .
premenoYes 0.51497 0.31724 1.623 0.10453
momfracYes 0.72432 0.35861 2.020 0.04340 *
smokeYes -0.72391 0.59531 -1.216 0.22398
rateriskSame 0.28239 0.32773 0.862 0.38888
rateriskGreater 0.47088 0.34763 1.355 0.17556
fracscore 0.17673 0.06171 2.864 0.00418 **
bonemedYes 1.77392 0.82928 2.139 0.03243 *
bonemed_fuYes 1.57985 0.55009 2.872 0.00408 **
bonetreatYes -2.81458 1.00298 -2.806 0.00501 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 449.87 on 399 degrees of freedom
Residual deviance: 387.07 on 386 degrees of freedom
AIC: 415.07
Number of Fisher Scoring iterations: 4
(vif(finalmodel)[,3])^2
site_id priorfrac height bmi premeno momfrac smoke raterisk fracscore bonemed bonemed_fu
1.041489 1.401640 1.092756 1.143599 1.088569 1.096608 1.044703 1.094237 1.456164 9.400026 4.361302
bonetreat
13.145499
vif(finalmodel)
GVIF Df GVIF^(1/(2*Df))
site_id 1.041489 1 1.020534
priorfrac 1.401640 1 1.183909
height 1.092756 1 1.045350
bmi 1.143599 1 1.069392
premeno 1.088569 1 1.043345
momfrac 1.096608 1 1.047191
smoke 1.044703 1 1.022107
raterisk 1.197355 2 1.046058
fracscore 1.456164 1 1.206716
bonemed 9.400026 1 3.065946
bonemed_fu 4.361302 1 2.088373
bonetreat 13.145499 1 3.625672
plot(finalmodel)
lets look at predictions for the lasso model also looking at the roc plot to select the most optimal threhold for classification
## removed sub_id
test.x = model.matrix(fracture~site_id+phy_id+priorfrac+age+weight+height+bmi+premeno+momfrac+armassist+ smoke+raterisk + fracscore +bonemed+bonemed_fu+bonetreat, test)
fit.pred.lasso = predict(finalmodel, newdata = test, type = "response")
hoslem.test(finalmodel$y, fitted(finalmodel), g=10)
Hosmer and Lemeshow goodness of fit (GOF) test
data: finalmodel$y, fitted(finalmodel)
X-squared = 2.2327, df = 8, p-value = 0.973
There is a large p-value so the test is a fit
results.lasso = prediction(fit.pred.lasso, test$fracture,
label.ordering=c("No","Yes"))
roc.lasso = performance(results.lasso, measure = "tpr", x.measure = "fpr")
plot(roc.lasso,colorize = TRUE)
abline(a=0, b= 1)
lets look at model performance metrics
cutoff<-0.3
class.lasso<-factor(ifelse(fit.pred.lasso>cutoff,"Yes","No"),levels=c("No","Yes"))
#Confusion Matrix for Lasso
conf.lasso<-table(class.lasso,test$fracture)
print("Confusion matrix for LASSO")
[1] "Confusion matrix for LASSO"
conf.lasso
class.lasso No Yes
No 55 13
Yes 20 12
precision <- posPredValue(class.lasso, test$fracture, positive="Yes")
recall <- sensitivity(class.lasso, test$fracture, positive="Yes")
F1 <- (2 * precision * recall) / (precision + recall)
print("accuracy")
[1] "accuracy"
mean(class.lasso==test$fracture)
[1] 0.67
print("precision")
[1] "precision"
precision
[1] 0.375
print("recall")
[1] "recall"
recall
[1] 0.48
print("F1")
[1] "F1"
F1
[1] 0.4210526
library(leaps)
nvmax = 17
reg_sq=regsubsets(fracture~.-sub_id,data=train, method="seqrep", nvmax=nvmax)
par(mfrow=c(2,2))
cp<-summary(reg_sq)$cp
plot(1:(nvmax),cp,type="l",ylab="CP",xlab="# of predictors")
index<-which(cp==min(cp))
points(index,cp[index],col="red",pch=10)
bics<-summary(reg_sq)$bic
plot(1:(nvmax),bics,type="l",ylab="BIC",xlab="# of predictors")
index<-which(bics==-0.05839447)
points(index,bics[index],col="red",pch=10)
adjr2<-summary(reg_sq)$adjr2
plot(1:(nvmax),adjr2,type="l",ylab="Adjusted R-squared",xlab="# of predictors")
index<-which(adjr2==max(adjr2))
points(index,adjr2[index],col="red",pch=10)
rss<-summary(reg_sq)$rss
plot(1:(nvmax),rss,type="l",ylab="train RSS",xlab="# of predictors")
index<-which(rss==min(rss))
points(index,rss[index],col="red",pch=10)
cbind(CP=summary(reg_sq)$cp,
r2=summary(reg_sq)$rsq,
Adj_r2=summary(reg_sq)$adjr2,
BIC=summary(reg_sq)$bic,
RSS = summary(reg_sq)$rss)
CP r2 Adj_r2 BIC RSS
[1,] 29.896467 0.06228673 0.05993067 -13.74149469 70.32850
[2,] 21.262665 0.08569960 0.08109355 -17.86404376 68.57253
[3,] 17.163256 0.09912891 0.09230413 -17.79138417 67.56533
[4,] 14.815639 0.10870123 0.09967542 -16.07291365 66.84741
[5,] 12.346001 0.11854221 0.10735620 -14.52247919 66.10933
[6,] 9.401752 0.12942816 0.11613699 -13.50174767 65.29289
[7,] 6.574529 0.14005645 0.12470032 -12.42369709 64.49577
[8,] 6.456459 0.14471989 0.12722056 -8.60732019 64.14601
[9,] 6.605170 0.14879595 0.12915278 -4.52671474 63.84030
[10,] 7.135831 0.15203105 0.13023236 -0.05839447 63.59767
[11,] 7.863885 0.15483155 0.13087058 4.60984770 63.38763
[12,] 9.203207 0.15628619 0.13012452 9.91226888 63.27854
[13,] 26.199933 0.12326730 0.09374003 31.25925645 65.75495
[14,] 28.025380 0.12365162 0.09178440 37.07534104 65.72613
[15,] 14.011020 0.15891107 0.12605604 26.64027837 63.08167
[16,] 26.238505 0.13639281 0.10031522 43.19999545 64.77054
[17,] 18.000000 0.15893534 0.12150576 38.61166880 63.07985
coef(reg_sq, 10)
(Intercept) priorfracYes weight bmi premenoYes momfracYes smokeYes fracscore bonemedYes
0.896192259 0.074990114 -0.009135076 0.029519214 0.078337622 0.147229030 -0.100225591 0.027287755 0.372353460
bonemed_fuYes bonetreatYes
0.356825487 -0.614896361
#To deal with the redundamcy, I would throw the cylinder variable out and then see what happens
model.main<-glm(fracture ~priorfrac+weight+bmi+premeno+momfrac+smoke+fracscore+bonemed+bonemed_fu+bonetreat, data=train,family = binomial(link="logit"))
summary(model.main)
Call:
glm(formula = fracture ~ priorfrac + weight + bmi + premeno +
momfrac + smoke + fracscore + bonemed + bonemed_fu + bonetreat,
family = binomial(link = "logit"), data = train)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.68327 -0.74758 -0.51429 -0.00849 2.32662
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -3.42910 0.73446 -4.669 3.03e-06 ***
priorfracYes 0.37714 0.30346 1.243 0.21394
weight -0.05476 0.02293 -2.388 0.01695 *
bmi 0.17836 0.06172 2.890 0.00385 **
premenoYes 0.53354 0.31491 1.694 0.09022 .
momfracYes 0.81220 0.35193 2.308 0.02101 *
smokeYes -0.69295 0.59151 -1.171 0.24140
fracscore 0.17217 0.06114 2.816 0.00487 **
bonemedYes 1.87128 0.83048 2.253 0.02424 *
bonemed_fuYes 1.73253 0.53628 3.231 0.00124 **
bonetreatYes -2.93248 1.00520 -2.917 0.00353 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 449.87 on 399 degrees of freedom
Residual deviance: 388.89 on 389 degrees of freedom
AIC: 410.89
Number of Fisher Scoring iterations: 4
exp(cbind("Odds ratio" = coef(model.main), confint.default(model.main, level = 0.95)))
Odds ratio 2.5 % 97.5 %
(Intercept) 0.03241615 0.007684079 0.1367511
priorfracYes 1.45811288 0.804427013 2.6429908
weight 0.94671625 0.905106185 0.9902392
bmi 1.19525393 1.059076423 1.3489413
premenoYes 1.70495430 0.919722940 3.1605922
momfracYes 2.25286510 1.130235590 4.4905692
smokeYes 0.50010048 0.156877764 1.5942380
fracscore 1.18787740 1.053723165 1.3391114
bonemedYes 6.49663181 1.275805335 33.0820257
bonemed_fuYes 5.65495838 1.976724475 16.1775476
bonetreatYes 0.05326457 0.007426895 0.3820054
vif(model.main)
priorfrac weight bmi premeno momfrac smoke fracscore bonemed bonemed_fu bonetreat
1.330137 9.180248 8.988158 1.075832 1.064687 1.038839 1.426069 9.429683 4.153687 13.202319
#Residual diagnostics can be obtained using
plot(model.main)
NA
fit.pred.step = predict(model.main, newdata = test, type = "response")
results.step = prediction(fit.pred.step, test$fracture,
label.ordering=c("No","Yes"))
roc.step = performance(results.step, measure = "tpr", x.measure = "fpr")
plot(roc.step, colorize = TRUE)
abline(a=0, b= 1)
lets look at model performance metrics
cutoff<-0.3
class.step<-factor(ifelse(fit.pred.step>cutoff,"Yes","No"),levels=c("No","Yes"))
#Confusion Matrix for Lasso
conf.step<-table(class.step,test$fracture)
print("Confusion matrix for LASSO")
[1] "Confusion matrix for LASSO"
conf.step
class.step No Yes
No 57 14
Yes 18 11
precision <- posPredValue(class.step, test$fracture, positive="Yes")
recall <- sensitivity(class.step, test$fracture, positive="Yes")
F1 <- (2 * precision * recall) / (precision + recall)
print("accuracy")
[1] "accuracy"
mean(class.step==test$fracture)
[1] 0.68
print("precision")
[1] "precision"
precision
[1] 0.3793103
print("recall")
[1] "recall"
recall
[1] 0.44
print("F1")
[1] "F1"
F1
[1] 0.4074074